# coding=utf-8
import re
import urllib2


class bdtb:
    """Small scraper for one Baidu Tieba thread.

    The thread URL is combined with a ``see_lz`` query parameter and a page
    number to download one page of the thread; post bodies are then pulled
    out of the HTML with a regular expression.
    """

    # Captures the inner HTML of every ``<div id="post_content_...">`` element.
    # re.S makes ``.`` match newlines so multi-line post bodies are captured.
    # The match is non-greedy, so a post body stops at its first ``</div>``.
    _POST_CONTENT_RE = re.compile(
        r'<div id="post_content_.*?>(.*?)</div>', re.S)

    def __init__(self, base_url, see_lz):
        """Remember the thread URL and the ``see_lz`` query fragment.

        Args:
            base_url: URL of the thread, without any query string.
            see_lz: Value sent as the ``see_lz`` query parameter; it is
                converted with ``str()``.  (Presumably the Tieba "only show
                the thread starter" switch -- confirm against the site.)
        """
        self.base_url = base_url
        self.see_lz = '?see_lz=' + str(see_lz)

    def get_page(self, page_num):
        """Download one page of the thread.

        Args:
            page_num: Page index, sent as the ``pn`` query parameter.

        Returns:
            The response body decoded as UTF-8, or ``None`` when the request
            fails with ``urllib2.URLError`` (the failure is printed).

        Raises:
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        url = self.base_url + self.see_lz + "&pn=" + str(page_num)
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                body = response.read()
            finally:
                # Always release the connection, even if read() fails.
                response.close()
        except urllib2.URLError as e:
            # Report every URLError, falling back to the exception itself
            # when it carries no ``reason`` attribute, instead of silently
            # returning None.
            reason = getattr(e, "reason", e)
            print(u"失败 %s" % (reason,))
            return None
        return body.decode('utf-8')

    def get_content(self, page):
        """Extract the raw HTML body of every post on a page.

        Args:
            page: HTML text as returned by ``get_page``; may be ``None`` or
                empty when the download failed.

        Returns:
            A list with the inner HTML of each ``post_content_*`` div, in
            document order.  Empty when ``page`` is ``None`` or empty.
        """
        if not page:
            # get_page returns None on failure; treat that as "no posts"
            # rather than letting the regex raise TypeError.
            return []
        return self._POST_CONTENT_RE.findall(page)


# Ordered HTML clean-up rules applied to every post body.  The order is
# significant: spaces are stripped before the paragraph rule inserts its
# two-space indent, and the catch-all tag remover must run last.
_CLEANUP_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        ('<img.*?>', ""),                  # drop images
        (' ', ""),                         # drop every space character
        ('<tr>|<div>|</div>|</p>', "\n"),  # block boundaries -> newline
        ('<td>', '\t'),                    # table cells -> tab
        ('<p.*?>', "\n  "),                # paragraph start -> indented line
        ('<br><br>|<br>', "\n"),           # single or double <br> -> newline
        ('<.*?>', ""),                     # remove any remaining tag
    )
)

base_url = 'http://tieba.baidu.com/p/3138733512'
# NOTE: this rebinds the module-level name ``bdtb`` from the class to the
# instance; kept as-is so the module's names do not change.
bdtb = bdtb(base_url, 1)
page = bdtb.get_page(1)
# get_page returns None when the download fails; skip extraction in that
# case instead of handing None to the regex (which raises TypeError).
items = bdtb.get_content(page) if page is not None else []
cnt = 0  # stays 0 when there are no posts
for cnt, item in enumerate(items, 1):
    for pattern, replacement in _CLEANUP_RULES:
        item = pattern.sub(replacement, item)
    item = item.strip()
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(str(cnt) + u"项------->\n" + item)
